File set-up

Set working directory to current directory

if (rstudioapi::isAvailable()) {
  setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
}

Load standard libraries and resolve conflicts

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package
conflict_prefer("select", "dplyr")
## [conflicted] Will prefer dplyr::select over any other package
conflict_prefer("slice", "dplyr")
## [conflicted] Will prefer dplyr::slice over any other package
conflict_prefer("rename", "dplyr")
## [conflicted] Will prefer dplyr::rename over any other package
conflict_prefer('intersect', 'dplyr')
## [conflicted] Will prefer dplyr::intersect over any other package

Read data

cq = read_tsv('../data/Supplementary_Table_3_selected_circRNAs.txt')
## Rows: 1560 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): chr, strand, circ_id, circ_id_strand, tool, cell_line, FWD_primer,...
## dbl (20): start, end, FWD_pos, FWD_length, REV_pos, REV_length, FWD_TM, REV_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cq

Presence in db (n = 890)

make count table

cont_table = cq %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group, n_db) %>%
  unique() %>%
  # only keep high abundant circRNAs
  filter(count_group == 'count ≥ 5') %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 'pass', 'fail')) %>%
  # change nr of databases to binary
  mutate(n_db = ifelse(is.na(n_db), 0, 1)) %>%
  group_by(n_db) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-n_db)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("novel", "in_db")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##           fail     pass
## novel 16.23933 132.7607
## in_db 80.76067 660.2393

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 162.61, df = 1, p-value < 2.2e-16

OR

OR = (705/36)/(88/61)
OR
## [1] 13.57481

Canonical splicing (n = 722)

cont_table = cq %>%
  filter(count_group == 'count ≥ 5',
         !strand == 'unknown') %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ss_motif) %>%
  unique() %>%
  mutate(ss_can = ifelse(ss_motif == "AGNGT", 1, 0)) %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>%
  # change nr of databases to binary
  group_by(ss_can) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-ss_can)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("non_cannonical", "cannonical")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                    fail      pass
## non_cannonical 11.99169  99.00831
## cannonical     66.00831 544.99169

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 42.044, df = 1, p-value = 8.923e-11

OR

OR = (565/46)/(79/32)
OR
## [1] 4.975234

Known annotation (n = 891)

cont_table = cq %>%
  filter(count_group == "count ≥ 5") %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, ENST) %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail'),
         ENST_group = ifelse(is.na(ENST), 'no_match', "match")) %>% 
  # change nr of databases to binary
  group_by(ENST_group) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-ENST_group)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("match", "no_match")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##               fail      pass
## match    87.637486 717.36251
## no_match  9.362514  76.63749

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 203.2, df = 1, p-value < 2.2e-16

OR

OR = (757/48)/(37/49)
OR
## [1] 20.8857

CircRNA detection tool approach (n = 798)

cont_table = cq %>%
  filter(count_group == "count ≥ 5") %>%
  left_join(read_tsv('../data/details/tool_annotation.txt')) %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, approach) %>%
  filter(!approach == 'integrative') %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>% 
  # change nr of databases to binary
  group_by(approach) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-approach)
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "tool"
cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("candidate-based", "segmented read-based")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                          fail     pass
## candidate-based      21.54762 159.4524
## segmented read-based 73.45238 543.5476

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 6.8785, df = 1, p-value = 0.008724

OR

OR = (170/11)/(533/84)
OR
## [1] 2.435613

BSJ count group (n = 1042)

cont_table = cq %>%
  filter(!count_group == "no_counts") %>%
  select(circ_id, cell_line, qPCR_val, RR_val, amp_seq_val, count_group) %>%
  unique() %>%
  # to use all val together
  filter(!is.na(amp_seq_val), !is.na(RR_val)) %>%
  mutate(all_val = ifelse(qPCR_val == RR_val & qPCR_val == amp_seq_val & qPCR_val == 'pass', 
                          'pass', 'fail')) %>%
  # change nr of databases to binary
  group_by(count_group) %>%
  count(all_val) %>%
  pivot_wider(values_from = n, names_from = all_val) %>%
  ungroup() %>%
  select(-count_group)


cont_table = as.data.frame(cont_table)
rownames(cont_table) <- c("count < 5", "count ≥ 5")

cont_table

plot data

mosaicplot(cont_table,
           main = "Mosaic plot",
           color = TRUE)

if chisquare expected < 5 => then Fisher should be used

chisq.test(cont_table)$expected
##                fail     pass
## count < 5  19.83877 132.1612
## count ≥ 5 116.16123 773.8388

chisquare test

chisq.test(cont_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  cont_table
## X-squared = 23.636, df = 1, p-value = 1.164e-06

OR

OR = (793/97)/(113/39)
OR
## [1] 2.821549

Using sensitivity

tool_anno = read_tsv('../data/details/tool_annotation.txt')
## Rows: 16 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): tool, approach, tool_lt, lin_annotation, strand_anno, splicing, BSJ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
val = read_tsv('../data/Supplementary_Table_4_precision_values.txt')
## Rows: 29 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (2): tool, count_group
## dbl (14): nr_qPCR_total, nr_qPCR_fail, nr_qPCR_val, nr_RR_total, nr_RR_fail,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

add annotation to sensitivity

sens_anno = val %>% 
  group_by(tool) %>%
  slice(1) %>%
  select(tool, sensitivity) %>%
  left_join(tool_anno) %>%
  select(-tool_lt) %>% ungroup()
## Joining, by = "tool"
sens_anno

Detection approach: segmented read-based VS candidate-based

wilcox.test(sensitivity ~ approach, data=sens_anno %>% filter(!approach == 'integrative')) 
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by approach
## W = 6, p-value = 0.1264
## alternative hypothesis: true location shift is not equal to 0
kruskal.test(sensitivity ~ approach, data=sens_anno)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  sensitivity by approach
## Kruskal-Wallis chi-squared = 3.3209, df = 2, p-value = 0.1901

Detection based on annotation: known splice sites VS entire genome

wilcox.test(sensitivity ~ lin_annotation, data=sens_anno %>% filter(!is.na(lin_annotation))) 
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by lin_annotation
## W = 20, p-value = 0.8396
## alternative hypothesis: true location shift is not equal to 0

Strand annotation method

kruskal.test(sensitivity ~ strand_anno, data=sens_anno %>% 
               filter(!is.na(strand_anno), !strand_anno == "no strand reported"))
## 
##  Kruskal-Wallis rank sum test
## 
## data:  sensitivity by strand_anno
## Kruskal-Wallis chi-squared = 4.8758, df = 3, p-value = 0.1811

Splicing: canonical VS non-canonical

wilcox_ss = wilcox.test(sensitivity ~ splicing, data=sens_anno) 

wilcox_ss
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by splicing
## W = 55, p-value = 0.0004579
## alternative hypothesis: true location shift is not equal to 0
sens_anno %>%
  group_by(splicing) %>%
  summarise(mean_sens = mean(sensitivity))

method 1

N = nrow(sens_anno)
N
## [1] 16
Z = qnorm(wilcox_ss$p.value/2)
Z
## [1] -3.504262
r = abs(Z)/sqrt(N)
r
## [1] 0.8760654

method 2

library(rstatix)
library(ggpubr)
sens_anno %>% rstatix::wilcox_test(sensitivity ~ splicing)
sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing)
#sens_anno %>% rstatix::wilcox_effsize(sensitivity ~ splicing, ci = T)

Median difference in sensitivity

median_diff = tibble()

for (tool_can in sens_anno %>% filter(splicing == "canonical") %>% pull(tool)) {
  sens_can = sens_anno %>% filter(tool == tool_can) %>% pull(sensitivity)
  
  for (tool_non_can in sens_anno %>% filter(splicing == "non-canonical") %>% pull(tool)) {
    sens_non_can = sens_anno %>% filter(tool == tool_non_can) %>% pull(sensitivity)
    
    median_diff = median_diff %>%
      bind_rows(tibble(tool_can, sens_can, tool_non_can, sens_non_can))
    
  }
}

median_diff = median_diff %>%
  mutate(sens_diff = sens_can - sens_non_can)

median_diff
median_diff %>% pull(sens_diff) %>% median()
## [1] 0.384535

Filtering

wilcox.test(sensitivity ~ BSJ_filter, data=sens_anno) 
## 
##  Wilcoxon rank sum exact test
## 
## data:  sensitivity by BSJ_filter
## W = 35, p-value = 0.4409
## alternative hypothesis: true location shift is not equal to 0